{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import fetch_20newsgroups\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n",
      "18846\n",
      "18846\n"
     ]
    }
   ],
   "source": [
    "news = fetch_20newsgroups(subset='all')\n",
    "print(news.target_names)\n",
    "print(len(news.data))\n",
    "print(len(news.target))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20\n"
     ]
    }
   ],
   "source": [
    "print(len(news.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>\\nSubject: Pens fans reactions\\nOrganization: Post Office, Carnegie Mellon, Pittsburgh, PA\\nLines: 12\\nNNTP-Posting-Host: po4.andrew.cmu.edu\\n\\n\\n\\nI am sure some bashers of Pens fans are pretty confused about the lack\\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\\nare killing those Devils worse than I thought. Jagr just showed you why\\nhe is much better than his regular season stats. He is also a lot\\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\\nregular season game.          PENS RULE!!!\\n\\n\""
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news.data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10\n",
      "rec.sport.hockey\n"
     ]
    }
   ],
   "source": [
    "print(news.target[0])\n",
    "print(news.target_names[news.target[0]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "x_train,x_test,y_train,y_test = train_test_split(news.data,news.target)\n",
    "# train = fetch_20newsgroups(subset='train')\n",
    "# x_train = train.data\n",
    "# y_train = train.target\n",
    "# test = fetch_20newsgroups(subset='test')\n",
    "# x_test = test.data\n",
    "# y_test = test.target"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "CountVectorizer方法构建单词的字典，每个单词实例被转换为特征向量的一个数值特征，每个元素是特定单词在文本中出现的次数  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['bird', 'cat', 'dog', 'fish']\n",
      "[[0 1 1 1]\n",
      " [0 2 1 0]\n",
      " [1 0 0 1]\n",
      " [1 0 0 0]]\n",
      "[2 3 2 2]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "texts=[\"dog cat fish\",\"dog cat cat\",\"fish bird\", 'bird']\n",
    "cv = CountVectorizer()\n",
    "cv_fit=cv.fit_transform(texts)\n",
    "\n",
    "# \n",
    "print(cv.get_feature_names())\n",
    "print(cv_fit.toarray())\n",
    "\n",
    "print(cv_fit.toarray().sum(axis=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.811\n"
     ]
    }
   ],
   "source": [
    "from sklearn import model_selection \n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "cv = CountVectorizer()\n",
    "cv_data = cv.fit_transform(x_train)\n",
    "mul_nb = MultinomialNB()\n",
    "\n",
    "scores = model_selection.cross_val_score(mul_nb, cv_data, y_train, cv=3, scoring='accuracy')  \n",
    "print(\"Accuracy: %0.3f\" % (scores.mean())) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TfidfVectorizer使用了一个高级的计算方法，称为Term Frequency Inverse Document  \n",
    "Frequency (TF-IDF)。这是一个衡量一个词在文本或语料中重要性的统计方法。直觉上讲，该方法通过比较在整个语料库的词的频率，寻求在当前文档中频率较高的词。这是一种将结果进行标准化的方法，可以避免因为有些词出现太过频繁而对一个实例的特征化作用不大的情况(我猜测比如a和and在英语中出现的频率比较高，但是它们对于表征一个文本的作用没有什么作用)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'brown': 0, 'dog': 1, 'over': 5, 'the': 7, 'jumped': 3, 'quick': 6, 'fox': 2, 'lazy': 4}\n",
      "[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718\n",
      " 1.69314718 1.        ]\n",
      "(1, 8)\n",
      "[[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646\n",
      "  0.36388646 0.42983441]]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "F:\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1059: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "# 文本文档列表\n",
    "text = [\"The quick brown fox jumped over the lazy dog.\",\n",
    "\"The dog.\",\n",
    "\"The fox\"]\n",
    "# 创建变换函数\n",
    "vectorizer = TfidfVectorizer()\n",
    "# 词条化以及创建词汇表\n",
    "vectorizer.fit(text)\n",
    "# 总结\n",
    "print(vectorizer.vocabulary_)\n",
    "print(vectorizer.idf_)\n",
    "# 编码文档\n",
    "vector = vectorizer.transform([text[0]])\n",
    "# 总结编码文档\n",
    "print(vector.shape)\n",
    "print(vector.toarray())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "F:\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1059: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.821\n"
     ]
    }
   ],
   "source": [
    "# 创建变换函数\n",
    "vectorizer = TfidfVectorizer()\n",
    "# 词条化以及创建词汇表\n",
    "tfidf_train = vectorizer.fit_transform(x_train)\n",
    "\n",
    "scores = model_selection.cross_val_score(mul_nb, tfidf_train, y_train, cv=3, scoring='accuracy') \n",
    "print(\"Accuracy: %0.3f\" % (scores.mean())) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "F:\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1059: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.898\n"
     ]
    }
   ],
   "source": [
    "def get_stop_words():\n",
    "    result = set()\n",
    "    for line in open('stopwords_en.txt', 'r').readlines():\n",
    "        result.add(line.strip())\n",
    "    return result\n",
    "\n",
    "# 加载停用词\n",
    "stop_words = get_stop_words()\n",
    "# 创建变换函数\n",
    "vectorizer = TfidfVectorizer(stop_words=stop_words)\n",
    "\n",
    "\n",
    "mul_nb = MultinomialNB(alpha=0.01)\n",
    "\n",
    "# 词条化以及创建词汇表\n",
    "tfidf_train = vectorizer.fit_transform(x_train)\n",
    "\n",
    "scores = model_selection.cross_val_score(mul_nb, tfidf_train, y_train, cv=3, scoring='accuracy') \n",
    "print(\"Accuracy: %0.3f\" % (scores.mean())) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "F:\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1059: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9958256686005377\n",
      "0.9093803056027164\n"
     ]
    }
   ],
   "source": [
    "# 切分数据集\n",
    "tfidf_data = vectorizer.fit_transform(news.data)\n",
    "x_train,x_test,y_train,y_test = train_test_split(tfidf_data,news.target)\n",
    "\n",
    "mul_nb.fit(x_train,y_train)\n",
    "print(mul_nb.score(x_train, y_train))\n",
    "\n",
    "print(mul_nb.score(x_test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
